{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from lightgbm.sklearn import LGBMClassifier\n",
    "from sklearn.metrics import accuracy_score, auc, roc_auc_score\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import KFold\n",
    "## 0. 打印设置\n",
    "pd.set_option('display.max_columns', None)\n",
    "# pd.set_option('display.max_rows', None)  ## 显示全部结果，不带省略点\n",
    "# pd.set_option('display.width', 1000)\n",
    "pd.set_option('display.float_format', '{:.0f}'.format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1.1 读取数据\n",
    "train_Base = pd.read_csv(r\"data/train.csv\")\n",
    "test_Base = pd.read_csv(r\"data/test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1.2 数据合并\n",
    "# data = pd.concat([test_Base, train_Base], axis=0)\n",
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1.3 数据清洗\n",
    "## 1.3.1 索引完善\n",
    "# data.index = range(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1.4 数据探索\n",
    "## 1.4.1 空值数量\n",
    "# data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 1.4.2 唯一值个数\n",
    "# for col in data.columns:\n",
    "#     print(col, data[col].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1.4.3 字符串的字段，唯一值统计\n",
    "# cat_columns = data.select_dtypes(include='object').columns  \n",
    "\n",
    "# column_name = []\n",
    "# unique_value = []\n",
    " \n",
    "# for col in cat_columns:\n",
    "#     column_name.append(col)\n",
    "#     unique_value.append(data[col].nunique())\n",
    "\n",
    "# df = pd.DataFrame()\n",
    "# df['col_name'] = column_name\n",
    "# df['value'] = unique_value\n",
    "# df = df.sort_values('value', ascending=False)\n",
    " \n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2 特征工程\n",
    "## 2.0 特征编码--property_damage、police_report_available\n",
    "# data['property_damage'].value_counts()\n",
    "# data['property_damage'] = data['property_damage'].map({'NO': 0, 'YES': 1, '?': 2})\n",
    "# data['property_damage'].value_counts()\n",
    "\n",
    "# data['police_report_available'].value_counts()\n",
    "# data['police_report_available'] = data['police_report_available'].map({'NO': 0, 'YES': 1, '?': 2})\n",
    "# data['police_report_available'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ## 2.2 去除无关的特征\n",
    "# data.drop(['policy_id'], axis=1, inplace=True)\n",
    "# data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2.1 特征编码--加入一个新的日期编码\n",
    "# policy_bind_date, incident_date\n",
    "# data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'])\n",
    "# data['incident_date'] = pd.to_datetime(data['incident_date'])\n",
    " \n",
    "# # 查看最大日期，最小日期\n",
    "# data['policy_bind_date'].min() # 1990-01-08\n",
    "# data['policy_bind_date'].max() # 2015-02-22\n",
    "\n",
    "# data['incident_date'].min() # 2015-01-01\n",
    "# data['incident_date'].max() # 2015-03-01\n",
    "\n",
    "# base_date = data['policy_bind_date'].min()\n",
    "# # 转换为date_diff\n",
    "# data['policy_bind_date_diff'] = (data['policy_bind_date'] - base_date).dt.days\n",
    "# data['incident_date_diff'] = (data['incident_date'] - base_date).dt.days\n",
    "\n",
    "# #去掉原始日期字段 policy_bind_date    incident_date\n",
    "# data.drop(['policy_bind_date', 'incident_date'], axis=1, inplace=True)\n",
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2.3 标签编码\n",
    "cat_columns = train_Base.select_dtypes(include=['object']).columns\n",
    "le = LabelEncoder()\n",
    "for col in cat_columns:\n",
    "    train_Base[col] = le.fit_transform(train_Base[col])\n",
    "    test_Base[col] = le.fit_transform(test_Base[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['policy_bind_date', 'policy_state', 'policy_csl', 'insured_sex',\n",
       "       'insured_education_level', 'insured_occupation', 'insured_hobbies',\n",
       "       'insured_relationship', 'incident_date', 'incident_type',\n",
       "       'collision_type', 'incident_severity', 'authorities_contacted',\n",
       "       'incident_state', 'incident_city', 'property_damage',\n",
       "       'police_report_available', 'auto_make', 'auto_model'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2.4 分箱编码\n",
    "\n",
    "# ## 1）age分箱\n",
    "# for x in range(10,70,10):\n",
    "#     train_Base[train_Base['age'].between(x,x+10)].loc[:,['age']]=x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## 3. 数据集切分\n",
    "## 3.1 切分训练集和测试集\n",
    "# train = data[data['fraud'].notnull()]\n",
    "# test = data[data['fraud'].isnull()]\n",
    "X_train = train_Base.drop(columns=['policy_id', 'fraud'])\n",
    "Y_train = train_Base['fraud']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 3.2 训练集中，训练集和验证集的划分\n",
    "\n",
    "# x_train, x_train_01 = train_test_split(train.drop(['fraud'],axis=1), test_size=0.2, random_state=42)  # 25% of remaining data as validation set  \n",
    "# y_train, y_train_01 = train_test_split(train['fraud'], test_size=0.2, random_state=42)  # Split labels accordingly  \n",
    "\n",
    "# x_train, x_train_01, y_train, y_train_01 = train_test_split(X, Y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 4. 模型训练\n",
    "## 4.1 建立模型\n",
    "# gbm = LGBMClassifier(n_estimators=600, learning_rate=0.01, boosting_type='gbdt',  ## 模型训练超参数 调优参考：https://blog.51cto.com/u_16213313/7201851\n",
    "#                      objective='binary',   ## LGBMClassifier详解： https://blog.csdn.net/yeshang_lady/article/details/118638269\n",
    "#                      max_depth=-1,\n",
    "#                      random_state=2022,\n",
    "#                      metric='auc')\n",
    "\n",
    "gbm = LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',\n",
    "            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2022,\n",
    "            n_estimators=300, subsample=1, colsample_bytree=1,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-------------模型评估第1次-------------\n",
      "auc值: 0.8205222961320522\n",
      "Accuracy: 0.7785714285714286\n",
      "Precision: 0.6923076923076923\n",
      "Recall: 0.43902439024390244\n",
      "F1 Score: 0.5373134328358209\n",
      "-------------模型评估第2次-------------\n",
      "auc值: 0.7914816870144283\n",
      "Accuracy: 0.7928571428571428\n",
      "Precision: 0.5862068965517241\n",
      "Recall: 0.5\n",
      "F1 Score: 0.5396825396825397\n",
      "-------------模型评估第3次-------------\n",
      "auc值: 0.8418329637841834\n",
      "Accuracy: 0.8\n",
      "Precision: 0.6585365853658537\n",
      "Recall: 0.6585365853658537\n",
      "F1 Score: 0.6585365853658537\n",
      "-------------模型评估第4次-------------\n",
      "auc值: 0.8658536585365854\n",
      "Accuracy: 0.8142857142857143\n",
      "Precision: 0.6829268292682927\n",
      "Recall: 0.6829268292682927\n",
      "F1 Score: 0.6829268292682927\n",
      "-------------模型评估第5次-------------\n",
      "auc值: 0.8171695402298851\n",
      "Accuracy: 0.8428571428571429\n",
      "Precision: 0.5416666666666666\n",
      "Recall: 0.5416666666666666\n",
      "F1 Score: 0.5416666666666666\n",
      "-------------模型评估最后结果-------------\n",
      "auc平均值： 0.8273720291394269\n",
      "accuracy平均值： 0.8057142857142858\n",
      "precision平均值： 0.6323289340320458\n",
      "recall平均值： 0.5644308943089431\n",
      "f1平均值： 0.5920252107638347\n"
     ]
    }
   ],
   "source": [
    "## 4.2 交叉验证训练\n",
    "\n",
    "n_folds=5\n",
    "auc_mean=0\n",
    "accuracy_mean = 0\n",
    "precision_mean = 0\n",
    "recall_mean = 0\n",
    "f1_mean = 0\n",
    "pred01 = []\n",
    "\n",
    "\n",
    "i=1\n",
    "\n",
    "\n",
    "kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)\n",
    "for index_train_train, index_train_test in kf.split(X_train):\n",
    "    #     print(index_train)\n",
    "    #     print('------------------')\n",
    "    #     print(index_yanzheng)\n",
    "    x_train_train = X_train.iloc[index_train_train]\n",
    "    y_train_train = Y_train.iloc[index_train_train]\n",
    "    #     print(train_X)\n",
    "    x_train_test = X_train.iloc[index_train_test]\n",
    "    y_train_test = Y_train.iloc[index_train_test]\n",
    "    ## 3.2 模型训练\n",
    "    gbm.fit(x_train_train, y_train_train)\n",
    "    ## 3.3 模型预测\n",
    "    y_train_test_pred = gbm.predict_proba(x_train_test)[:, 1]\n",
    "    \n",
    "\n",
    "    ## 3.4 模型评估指标计算\n",
    "    print(f'-------------模型评估第{i}次-------------')\n",
    "    auc = roc_auc_score(y_train_test, y_train_test_pred)\n",
    "    print(\"auc值:\", auc)\n",
    "    \n",
    "    y_train_test_pred[y_train_test_pred > 0.5] = 1\n",
    "    y_train_test_pred[y_train_test_pred <= 0.5] = 0\n",
    "\n",
    "    accuracy = accuracy_score(y_train_test, y_train_test_pred)  ## 计算准确率\n",
    "    precision = precision_score(y_train_test, y_train_test_pred) # 计算精确率\n",
    "    recall = recall_score(y_train_test, y_train_test_pred) # 计算召回率\n",
    "    f1 = f1_score(y_train_test, y_train_test_pred) # 计算F1值\n",
    "    \n",
    "    print(\"Accuracy:\", accuracy)\n",
    "    print(\"Precision:\", precision)\n",
    "    print(\"Recall:\", recall)\n",
    "    print(\"F1 Score:\", f1)\n",
    "\n",
    "    auc_mean += auc / n_folds\n",
    "    accuracy_mean +=accuracy / n_folds\n",
    "    precision_mean +=precision / n_folds\n",
    "    recall_mean += recall / n_folds\n",
    "    f1_mean += f1 / n_folds\n",
    "    \n",
    "    ## 3.5 测试集，预测并记录结果，最后求平均 \n",
    "    y_test_pred = gbm.predict_proba(test_Base.drop(['policy_id'],axis=1))[:,1]\n",
    "    pred01.append(y_test_pred)\n",
    " \n",
    "    i += 1\n",
    "\n",
    "print(f'-------------模型评估最后结果-------------')\n",
    "print(f'auc平均值： {auc_mean}')\n",
    "print(f'accuracy平均值： {accuracy_mean}')\n",
    "print(f'precision平均值： {precision_mean}')\n",
    "print(f'recall平均值： {recall_mean}')\n",
    "print(f'f1平均值： {f1_mean}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 4.2 模型训练\n",
    "## train.drop(['fraud'],axis=1) ## axis=0 表示行，axis=1 表示列\n",
    "# gbm.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 4.3 模型预测，以proba进行提交，结果会更好\n",
    "# y_train_01_pred = gbm.predict_proba(x_train_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "auc: 0.8273720291394269\n",
      "Accuracy: 0.8057142857142858\n",
      "Precision: 0.6323289340320458\n",
      "Recall: 0.5644308943089431\n",
      "F1 Score: 0.5920252107638347\n"
     ]
    }
   ],
   "source": [
    "## 5. 模型评估\n",
    "\n",
    "auc = auc_mean\n",
    "accuracy= accuracy_mean  ## 计算准确率\n",
    "precision = precision_mean  # 计算精确率\n",
    "recall = recall_mean # 计算召回率\n",
    "f1 = f1_mean # 计算F1值\n",
    "\n",
    "\n",
    "# 输出计算得到的准确率、召回率和F1值\n",
    "print(\"auc:\", auc)\n",
    "v_code = str(round(auc,5)).split('.')[1]\n",
    "print(\"Accuracy:\", accuracy)\n",
    "print(\"Precision:\", precision)\n",
    "print(\"Recall:\", recall)\n",
    "print(\"F1 Score:\", f1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ## 5.1 模型命名，版本控制\n",
    "model_name=f'model_0_{v_code}_base'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,\n",
       "       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,\n",
       "       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
       "       0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1.,\n",
       "       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
       "       0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n",
       "       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,\n",
       "       0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,\n",
       "       0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1.,\n",
       "       1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0.,\n",
       "       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,\n",
       "       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,\n",
       "       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,\n",
       "       0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,\n",
       "       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,\n",
       "       0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,\n",
       "       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.,\n",
       "       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 6 结果输出\n",
    "\n",
    "## test集预测，k交叉已预测\n",
    "## y_test_pred = gbm.predict_proba(test_Base.drop(['policy_id'],axis=1))\n",
    "\n",
    "## 6.0 测试集结果，求平均\n",
    "pred02 = sum(pred01) / n_folds  ## pred01预测集的中间结果\n",
    "\n",
    "pred02[ pred02 > 0.5] = '1'\n",
    "pred02[ pred02 <= 0.5] = '0'\n",
    "pred02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6.2 输出结果\n",
    "result = pd.read_csv('./data/submission.csv')\n",
    "result['fraud'] = pred02\n",
    "result.to_csv(f'./data/{model_name}.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_name</th>\n",
       "      <th>update_time</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "      <th>F1 Score</th>\n",
       "      <th>auc</th>\n",
       "      <th>sub_score</th>\n",
       "      <th>update_content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>model_0_8496_base</td>\n",
       "      <td>2024/4/26 11:55</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>base model</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>model_0_8914_base</td>\n",
       "      <td>2024/4/26 13:03</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>delete# 1.2 1.3 1.4 2.0(date-diff) ; add #3.1(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>model_0_8351_base</td>\n",
       "      <td>2024/4/26 13:19</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>base on model_0_8496_base; update ## 3.2 -- te...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          model_name      update_time  Accuracy  Precision  Recall  F1 Score  \\\n",
       "0  model_0_8496_base  2024/4/26 11:55         1          1       1         1   \n",
       "1  model_0_8914_base  2024/4/26 13:03         1          1       1         1   \n",
       "2  model_0_8351_base  2024/4/26 13:19         1          1       1         1   \n",
       "\n",
       "   auc  sub_score                                     update_content  \n",
       "0    1          1                                         base model  \n",
       "1    1          1  delete# 1.2 1.3 1.4 2.0(date-diff) ; add #3.1(...  \n",
       "2    1          1  base on model_0_8496_base; update ## 3.2 -- te...  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 7 模型评估结果输出\n",
    "evalue_result=pd.read_csv('./data/evalue_result.csv', encoding='utf-8')\n",
    "evalue_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 5.1 模型评估结果输出\n",
    "import datetime \n",
    "\n",
    "new_row = {'model_name': model_name, 'update_time': datetime.datetime.now() , 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall\n",
    " , 'F1 Score': f1, 'auc': auc, 'sub_score': 0.8128, 'update_content': 'base on baseModel; delete #3.2(data-split); add ## 4.2(k_evalue)'}  \n",
    "evalue_result.loc[len(evalue_result.index)] = new_row \n",
    "evalue_result\n",
    "evalue_result.to_csv('./data/evalue_result.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
